#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

#Purpose: Code union levels at the county leve

#Load packages
library(tidyverse)
library(tidylog)
library(here)
library(tidycensus)

#create file list
flist <- list.files(path= here("data", "input", "eia_7a"), pattern="coal", full.names=TRUE)
#load data from files
g <- lapply(flist, function(x) readxl::read_xls(x,skip=3))
gsub <- lapply(g, function(x) subset(x, select = c(Year, `MSHA ID`, `Mine State`,`Mine County`, `Mine Status`,`Production (short tons)`, `Union Code`)))
gsub <- do.call("rbind", gsub)
#subset to mines where there is no MSHA ID
gsub <- filter(gsub, `MSHA ID` != "-" & `MSHA ID` != "0")
names(gsub) <- tolower(names(gsub))
names(gsub) <- gsub(" ", "_", names(gsub))
gsub <- rename(gsub, prod = `production_(short_tons)`)
#create indicator for union or not
gsub <- gsub %>%
  mutate(
    union = case_when(
      union_code %in% c("Non-Union","-") | is.na(union_code) ~ "Non-Union",
      T ~ "Union"
    )
  )
#plot the number of unionized vs. non-unionized mines over time
p.issue <- gsub %>%
  group_by(year, union) %>%
  summarize(n=n()) %>%
  ggplot() + 
  geom_rect(aes(xmin=1986,xmax=1987,ymin=-0,ymax=4100),fill="red") +
  geom_line(aes(x=year,y=n,color=union,lty=union),size=1) +
  scale_y_continuous(limits = c(0,4100),expand=c(0,0)) +
  theme_bw(base_size=14) +
  labs(x="",y="Mines",color="",lty="") +
  scale_color_grey() +
  theme(
    panel.grid = element_blank(),
    legend.position = c(.85,.85),
    legend.background = element_rect(fill='transparent'),
    legend.box.background = element_rect(fill='transparent',color="transparent")
  )
#Figure H1
ggsave(
  plot = p.issue,
  filename = here("output", "figures", "si_fig_H1_mines_dataissue.png"),
  width = 5.5,
  height = 2.8,
  scale = 1.5,
  dpi = 300
)
#Check what happened in 1986 and 1987
u <- gsub %>%
  arrange(year,msha_id) %>%
  group_by(msha_id) %>%
  mutate(lagunion2 = ifelse(lag(union,2)=="Union",1,0),
         lagunion3 = ifelse(lag(union,3)=="Union",1,0),
         leadunion2 = ifelse(lead(union,2)=="Union",1,0),
         leadunion3 = ifelse(lead(union,3)=="Union",1,0))
u <- u %>%
  ungroup() %>%
  mutate(dataerror = case_when(
    union == "Non-Union" & lagunion2 == 1 & leadunion2 == 1 ~ 1,
    T ~ 0
    )
  )
table(u$year,u$dataerror)
u <- subset(u, select = c(msha_id, dataerror,year))
gsub <- left_join(gsub, u, by = c("msha_id", "year"))
#recode wrong mine as unionized if true
gsub <- gsub %>%
  mutate(
    union0 = case_when(
      union == "Union" ~ 1,
      dataerror == 1 ~ 1,
      T ~ 0
    )
  )
#filter out mines without counties listed
gsub <- filter(gsub, mine_county != "-")
#Figure H2: plot the number of unionized vs. nonunionized mines over time
p.fixed <- gsub %>%
  group_by(year, union0) %>%
  summarize(n=n()) %>%
  mutate(union0=ifelse(union0==1,"Union", "Non-Union")) %>%
  ggplot() + 
  geom_line(aes(x=year,y=n,color=union0,lty=union0),size=1) +
  theme_bw(base_size=14) +
  scale_y_continuous(limits = c(0,4100),expand=c(0,0)) +
  labs(x="",y="Mines",color="",lty="") +
  scale_color_grey() +
  theme(
    panel.grid = element_blank(),
    legend.position = c(.85,.85),
    legend.background = element_rect(fill='transparent'),
    legend.box.background = element_rect(fill='transparent',color="transparent")
  )
p.fixed
ggsave(
  plot = p.fixed,
  filename = here("output", "figures", "si_fig_H2_mines_datafix.png"),
  width = 5.5,
  height = 2.8,
  scale = 1.5,
  dpi = 300
)
#aggregate to the county level
gagg <- gsub %>%
  group_by(year,mine_state, mine_county, union0) %>%
  summarize(prod = sum(prod,na.rm=TRUE))
#fix state names
gagg$mine_state <- gsub("\\(East\\)|\\(West\\)|\\(Anthracite\\)|\\(Bituminous\\)|\\(Northern\\)|\\(Southern\\)","",gagg$mine_state)
gagg$mine_state <- trimws(gagg$mine_state)
gagg <- filter(gagg, mine_state != "Refuse Recovery")
table(gagg$mine_state)
#prep variables
gagg <- pivot_wider(gagg, names_from=union0,values_from=prod)
names(gagg)[c(4,5)] <- c("prod_nonunion", "prod_union")
gagg <- gagg %>% mutate(across(prod_nonunion:prod_union, ~ ifelse(is.na(.x),0,.x)))
#pair with fips codes
fips <- tidycensus::fips_codes
fips$county <- gsub("County|Parish|Borough", "", fips$county)
fips$county <- trimws(fips$county)
fips$county <- tolower(gsub("\\.","",fips$county))
gagg$mine_county <- tolower(gsub("\\.","",gagg$mine_county))
#fix missing counties
gagg[gagg$mine_county=="muhlenburg",]$mine_county <- "muhlenberg"
gagg[gagg$mine_county=="bighorn",]$mine_county <- "big horn"
gagg[gagg$mine_county=="crai",]$mine_county <- "craig"
gagg[gagg$mine_county=="clairborne",]$mine_county <- "claiborne"
gagg[gagg$mine_county=="raxton",]$mine_county <- "braxton"
gagg[gagg$mine_county=="athans",]$mine_county <- "athens"
gagg[gagg$mine_county=="yukon-koyukuk",]$mine_county <- "yukon-koyukuk census area"
gagg[gagg$mine_county=="de kalb",]$mine_county <- "dekalb"
gagg[gagg$mine_county=="tucker",]$mine_state <- "West Virginia" 
gagg[gagg$mine_county=="monongalia",]$mine_state <- "West Virginia" 
gagg[gagg$mine_county=="buchanan",]$mine_state <- "Virginia" 
#merge
m <- left_join(gagg,fips,by=c("mine_state"="state_name","mine_county"="county"))
#re-aggregate since some counties were off
m <- m %>%
  group_by(year,mine_state,mine_county,state_code, county_code) %>%
  summarize(across(prod_nonunion:prod_union, ~sum(.x, na.rm = TRUE)))
m$union <- ifelse(m$prod_union > 0, 1, 0)
m$unionshare <- with(m, prod_union / (prod_union + prod_nonunion))
m$unionshare <- ifelse(is.na(m$unionshare), 0, m$unionshare)
#create fips
m$fips <- paste0(m$state_code,m$county_code)
m$fips <- as.numeric(m$fips)
#Subset to relevant variables for analysis
m <- subset(m, select = c(year, fips, prod_nonunion:unionshare))
#save output
saveRDS(m, here("data", "inter", "eia7a.rds"))
